CGC clustering¶

Install some packages¶

pip install matplotlib¶
pip install pandas¶
pip install joblib¶
pip install sklearn¶
pip install plotly¶

In your terminal, type "python" to start the python.¶

Load Libraries¶

In [1]:
# for dealing with file directories
import os

# for making the plots
import matplotlib.pyplot as plt

# for dealing with dataframes
import pandas as pd

# for parallel computation
from joblib import Parallel, delayed

# for k mean clustering
from sklearn.cluster import KMeans

# for converting text sequences to features
from sklearn.feature_extraction.text import CountVectorizer

1. Prepare Data¶

In [2]:
# Set the your working directory
# os.chdir(r"/work/$your_group/$your_name/workshop_2022/CGC_clustering")
# example:
os.chdir(r"/work/yinlab/tangli/workshop_2022/CGC_clustering")
In [3]:
# Read dataframe
all_unsupervised = pd.read_csv("cgc_seq_substr_list.txt", sep="\t", header=None)
In [4]:
# Name columns for the dataframe
all_unsupervised.columns = [
    "Genome_ID",
    "CGC_ID",
    "sig_gene_seq",
    "low_level_substr",
    "Species",
]
In [5]:
all_unsupervised.head()
Out[5]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375
4 MGYG000000013 MGYG000000013_1|CGC5 GT2,1.E.14,2.A.122,GH105,PL42,GH43_24,GH154,HT... lactose,host glycan Bacteroides sp902362375
In [6]:
# View the shape of dataframe
all_unsupervised.shape
Out[6]:
(3352, 5)
In [7]:
# There are a few duplicates in the dataframe
all_unsupervised["sig_gene_seq"].value_counts()
Out[7]:
9.B.146,2.A.103,GT28                               24
GT28,2.A.103,9.B.146                               18
CBM50,3.A.5                                        17
1.B.14,8.A.46,GH18                                 16
3.A.5,CBM50                                        15
                                                   ..
2.A.66,GH16_3                                       1
GH95,1.B.14,GH2,GH35,GH43_10,1.B.14                 1
1.B.42,Aminotran_1_2,CBM67|GH78                     1
3.A.3,3.A.3,3.A.3,2.A.21,GerE,FecR,1.B.14,GH115     1
1.A.43,GH3,GH158,GH16_3,1.B.14                      1
Name: sig_gene_seq, Length: 2463, dtype: int64
In [8]:
# Show an example for duplicates
all_unsupervised[all_unsupervised["sig_gene_seq"] == "9.B.146,2.A.103,GT28"]
Out[8]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
10 MGYG000000013 MGYG000000013_1|CGC11 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp902362375
266 MGYG000000057 MGYG000000057_3|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp002491635
396 MGYG000000105 MGYG000000105_3|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides clarus
669 MGYG000000224 MGYG000000224_6|CGC5 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp003545565
707 MGYG000000236 MGYG000000236_3|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides fragilis_A
888 MGYG000000652 MGYG000000652_11|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides togonis
903 MGYG000000675 MGYG000000675_1|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides congonensis
1288 MGYG000001345 MGYG000001345_36|CGC11 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides xylanisolvens
1537 MGYG000001378 MGYG000001378_14|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides ovatus
1586 MGYG000001422 MGYG000001422_2|CGC10 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides oleiciplenus
1701 MGYG000001433 MGYG000001433_1|CGC39 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides salyersiae
1759 MGYG000001461 MGYG000001461_1|CGC10 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides neonati
1826 MGYG000001661 MGYG000001661_1|CGC8 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides gallinarum
1927 MGYG000002273 MGYG000002273_9|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides stercorirosoris
2392 MGYG000002549 MGYG000002549_18|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides caccae
2566 MGYG000003064 MGYG000003064_3|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides graminisolvens
2714 MGYG000003252 MGYG000003252_545|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900761785
2739 MGYG000003312 MGYG000003312_4|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900547205
2922 MGYG000003363 MGYG000003363_45|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900766195
3004 MGYG000003367 MGYG000003367_48|CGC5 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp007896885
3157 MGYG000004019 MGYG000004019_6|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides massiliensis_A
3195 MGYG000004185 MGYG000004185_1|CGC6 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900553815
3240 MGYG000004676 MGYG000004676_2|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900766005
3310 MGYG000004899 MGYG000004899_9|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900555635
In [9]:
# Remove duplicates
all_unsupervised = all_unsupervised.drop_duplicates("sig_gene_seq")
all_unsupervised.shape
Out[9]:
(2463, 5)
In [10]:
# Write out a csv file
all_unsupervised.to_csv("unsupervised_cgc_data.tsv", index=False, sep="\t")
In [11]:
all_unsupervised.head()
Out[11]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375
4 MGYG000000013 MGYG000000013_1|CGC5 GT2,1.E.14,2.A.122,GH105,PL42,GH43_24,GH154,HT... lactose,host glycan Bacteroides sp902362375
In [12]:
# Some low-level subratrates have multiple categories in one string
# removing those
all_unsupervised["lens"] = [
    len(str(seq).split(",")) for seq in all_unsupervised["low_level_substr"]
]
In [13]:
# Keep only the clean ones (one subratrate)
all_unsupervised = all_unsupervised[all_unsupervised["lens"] == 1]
all_unsupervised
Out[13]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
... ... ... ... ... ... ...
3340 MGYG000004899 MGYG000004899_23|CGC1 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_... xylan Bacteroides sp900555635 1
3342 MGYG000004899 MGYG000004899_28|CGC1 GT0,GT4,9.B.18,GT0 capsule polysaccharide Bacteroides sp900555635 1
3343 MGYG000004899 MGYG000004899_30|CGC1 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2... beta-glucan Bacteroides sp900555635 1
3344 MGYG000004899 MGYG000004899_31|CGC1 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... host glycan Bacteroides sp900555635 1
3345 MGYG000004899 MGYG000004899_32|CGC1 PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.... rhamnogalacturonan Bacteroides sp900555635 1

1931 rows × 6 columns

In [14]:
# all_unsupervised = pd.concat([all_unsupervised_clean], ignore_index = True)
In [15]:
# Count the number of cgc sequences for each subratrate
all_unsupervised["low_level_substr"].value_counts()
Out[15]:
host glycan               346
capsule polysaccharide    201
rhamnogalacturonan        122
glycosaminoglycan         120
pectin                    108
                         ... 
fucosyllactose              1
cellobiose                  1
trehalose                   1
sorbitol                    1
beta-glucoside              1
Name: low_level_substr, Length: 64, dtype: int64
In [16]:
all_unsupervised["low_level_substr"]
Out[16]:
0                galactomannan
1                 alpha-mannan
2                  host glycan
3                       starch
5                 arabinoxylan
                 ...          
3340                     xylan
3342    capsule polysaccharide
3343               beta-glucan
3344               host glycan
3345        rhamnogalacturonan
Name: low_level_substr, Length: 1931, dtype: object
In [17]:
# Need to decide on delimeters
all_unsupervised[["sig_gene_seq"]].values[-1][0]
Out[17]:
'PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.B.14,CBM67|GH78,HTH_AraC,GH106'
In [18]:
# Treat "|" as ","
all_unsupervised[["sig_gene_seq"]].values[-1][0].replace("|", ",")
Out[18]:
'PL8_3,CBM67,GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.B.14,CBM67,GH78,HTH_AraC,GH106'
In [19]:
# Split the cgc sequences
all_unsupervised[["sig_gene_seq"]].values[-1][0].replace("|", ",").split(",")
Out[19]:
['PL8_3',
 'CBM67',
 'GH78',
 'GH28',
 'GH88',
 'GH2',
 'PL8_3',
 '8.A.46',
 '1.B.14',
 'CBM67',
 'GH78',
 'HTH_AraC',
 'GH106']
In [20]:
all_unsupervised.head()
Out[20]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
In [21]:
# check if there is missing value
all_unsupervised.isnull().sum()
# Drop missing values if possible
# all_unsupervised = all_unsupervised.dropna()
Out[21]:
Genome_ID           0
CGC_ID              0
sig_gene_seq        0
low_level_substr    0
Species             0
lens                0
dtype: int64
In [22]:
# Only keep subratrate classes which have more than 2 occurrences
count_df = pd.DataFrame(
    all_unsupervised["low_level_substr"].value_counts()[
        all_unsupervised["low_level_substr"].value_counts() >= 2
    ]
).reset_index()
In [23]:
# Get their names
to_keep_substrates = count_df["index"].values
to_keep_substrates
Out[23]:
array(['host glycan', 'capsule polysaccharide', 'rhamnogalacturonan',
       'glycosaminoglycan', 'pectin', 'alpha-mannan', 'N-glycan',
       'homogalacturonan', 'xylan', 'arabinoxylan', 'porphyran',
       'hemicellulose', 'mucin', 'acetylated glucuronoxylan',
       'beta-glucan', ' ', 'plant polysaccharide', 'xyloglucan',
       'arabinogalactan', 'fructan', 'O-antigen', 'galactomannan',
       'exopolysaccharide', 'sialoglycoconjugate', 'dextran', 'starch',
       'arabinan', 'cellulose', 'glycogen', 'carrageenan', 'alginate',
       'maltose', 'beta-mannan', 'O-glycan', 'pullulan', 'mannose',
       'alpha-glucan', 'galactan', 'sialic acid', 'ribose', 'chitin',
       'xanthan', 'arabinose', 'maltooligosaccharide',
       '4-methylumbelliferyl 6-azido-6-deoxy-beta-D-galactoside',
       'acarbose', 'glucomannan', 'glucose'], dtype=object)
In [24]:
# Subset to only those that are more than 2
all_unsupervised = all_unsupervised[
    all_unsupervised["low_level_substr"].isin(to_keep_substrates)
]
In [25]:
all_unsupervised
Out[25]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
... ... ... ... ... ... ...
3340 MGYG000004899 MGYG000004899_23|CGC1 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_... xylan Bacteroides sp900555635 1
3342 MGYG000004899 MGYG000004899_28|CGC1 GT0,GT4,9.B.18,GT0 capsule polysaccharide Bacteroides sp900555635 1
3343 MGYG000004899 MGYG000004899_30|CGC1 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2... beta-glucan Bacteroides sp900555635 1
3344 MGYG000004899 MGYG000004899_31|CGC1 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... host glycan Bacteroides sp900555635 1
3345 MGYG000004899 MGYG000004899_32|CGC1 PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.... rhamnogalacturonan Bacteroides sp900555635 1

1915 rows × 6 columns

In [26]:
# View all cgc sequences in the subset
all_unsupervised["sig_gene_seq"]
Out[26]:
0                                   1.B.14,GH29,GH16,GH76
1                    1.B.14,HTH_AraC,GH92,GH76,GH130,GH92
2                                      GH18,8.A.46,1.B.14
3                                        GH13,GH97,1.B.14
5       2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_...
                              ...                        
3340    3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_...
3342                                   GT0,GT4,9.B.18,GT0
3343    2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2...
3344    1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8....
3345    PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1....
Name: sig_gene_seq, Length: 1915, dtype: object
In [27]:
# Write selected out a csv file
all_unsupervised.to_csv("unsupervised_cgc_selected.tsv", index=False, sep="\t")

2. Train and Test Subsets¶

In [28]:
# Train test split - Split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split
In [29]:
# Train test split
# Split original data into 70% for clustering and 30% for finding the mappings
X_train, X_test, y_train, y_test = train_test_split(
    all_unsupervised["sig_gene_seq"],
    all_unsupervised["low_level_substr"],
    test_size=0.3,
    stratify=all_unsupervised["low_level_substr"].values,
)

## X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=array-like)
## test_size: the proportion of the dataset to include in the train split
## stratify: If not None, data is split in a stratified fashion, using this as the class labels.
In [30]:
# Supervised is for finding the mappings
X_supervised = pd.concat([X_test, y_test], 1).reset_index(drop=True)
X_supervised
/tmp/ipykernel_3088779/4208520920.py:2: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only
  X_supervised = pd.concat([X_test, y_test], 1).reset_index(drop=True)
Out[30]:
sig_gene_seq low_level_substr
0 GH2,GH84,2.A.37 rhamnogalacturonan
1 GH97,GH92,8.A.46,1.B.14 mucin
2 3.A.1,HTH_AraC,2.A.2,GH43_1,GH67 xylan
3 GT2,1.B.14,1.B.14,GH92,GH28,GH92,GH89 alpha-mannan
4 2.A.21,3.A.10,GH63,GH2,2.A.124,2.A.124,1.B.14 rhamnogalacturonan
... ... ...
570 2.A.114,GH141,PL12_2,1.B.14,1.B.14,GH29,HTH_Ar... host glycan
571 2.A.13,3.A.11,GH123
572 2.A.81,1.B.14,2.A.4,GH20 N-glycan
573 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... host glycan
574 GH84,GH97,1.B.42,3.A.1,3.A.1 acetylated glucuronoxylan

575 rows × 2 columns

In [31]:
# View data for clustering
X_train
Out[31]:
737                               GH29,8.A.46,1.B.14,GH97
1478    2.A.21,GerE,FecR,1.B.14,TPR_1,PL35,GH88,GerE,F...
2366                              1.B.14,CBM32,GH2,2.A.88
170                 2.A.66,GT94,GT0,GT4,GT4,GT4,GT2,GH115
907     3.A.11,PL11_1,GH105,GH28,GH154,GH88,GH2,1.B.14...
                              ...                        
3148    PL8_3,PL38,GH88,1.B.14,GH29,GH31,GH2,PL38,GH10...
895                                        2.A.1,GH2,GH35
2223                                   GT2,2.A.66,GT4,GT2
2650    1.B.14,Sigma70_r2|Sigma70_r4_2,GH28,HTH_AraC,1...
113            GH2,GH53,8.A.46,1.B.14,GH147,HTH_AraC,PL13
Name: sig_gene_seq, Length: 1340, dtype: object

3. Convert words to vectors¶

In [32]:
# Countvectorizer
# To convert a collection of text documents to a vector of term/token counts
count_vectorizer = CountVectorizer(
    tokenizer=lambda x: str(x).replace("|", ",").split(","), lowercase=False
)
In [33]:
# Fit the count vectorizer (learn a vocabulary dictionary of all tokens in the raw documents)
count_vectorizer.fit(X_train)
/util/opt/anaconda/deployed-conda-envs/packages/biopython/envs/biopython-1.79-py39/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:489: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn("The parameter 'token_pattern' will not be used"
Out[33]:
CountVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x1489152db1f0>)
In [34]:
# Transform texts to sequences
count_vectorized_array = count_vectorizer.transform(X_train)
In [35]:
count_vectorized_array
# count_vectorized_array.toarray()
Out[35]:
<1340x415 sparse matrix of type '<class 'numpy.int64'>'
	with 7112 stored elements in Compressed Sparse Row format>
In [36]:
# vocabulary - dictionary containing the word (key) and vector (value)
vocabulary = count_vectorizer.vocabulary_
vocabulary
# View
first20vocabulary = {k: vocabulary[k] for k in sorted(vocabulary.keys())[:20]}
first20vocabulary
Out[36]:
{'1.A.1': 0,
 '1.A.13': 1,
 '1.A.22': 2,
 '1.A.23': 3,
 '1.A.26': 4,
 '1.A.28': 5,
 '1.A.30': 6,
 '1.A.33': 7,
 '1.A.34': 8,
 '1.A.35': 9,
 '1.A.62': 10,
 '1.A.78': 11,
 '1.B.14': 12,
 '1.B.17': 13,
 '1.B.18': 14,
 '1.B.44': 15,
 '1.B.5': 16,
 '1.B.6': 17,
 '1.B.9': 18,
 '1.C.105': 19}
In [37]:
# Inverse vocabulary
inv_map_vocab = {v: k for k, v in vocabulary.items()}
# View
first20_inv_vocabulary = {
    k: inv_map_vocab[k] for k in sorted(inv_map_vocab.keys())[:20]
}
first20_inv_vocabulary
Out[37]:
{0: '1.A.1',
 1: '1.A.13',
 2: '1.A.22',
 3: '1.A.23',
 4: '1.A.26',
 5: '1.A.28',
 6: '1.A.30',
 7: '1.A.33',
 8: '1.A.34',
 9: '1.A.35',
 10: '1.A.62',
 11: '1.A.78',
 12: '1.B.14',
 13: '1.B.17',
 14: '1.B.18',
 15: '1.B.44',
 16: '1.B.5',
 17: '1.B.6',
 18: '1.B.9',
 19: '1.C.105'}

4. CGC Clustering¶

In [38]:
# Affinity Propagation - a machine learning algorithm that can find data centers or clusters
# by sending messages between pairs of data points.
# does not need to know number of clusters
from sklearn.cluster import AffinityPropagation
In [39]:
# Fit the clustering algorithm on the unsupervised data
clustering = AffinityPropagation(random_state=5).fit(count_vectorized_array)
# random_state: pseudo-random number generator to control the starting state.
In [40]:
clustering.labels_
Out[40]:
array([ 58,  38,  11, ..., 143, 158,  40])
In [41]:
# For counting
# Counter: it is a collection where elements are stored as dictionary keys
# and their counts are stored as dictionary values.
from collections import Counter
In [42]:
# Count for number of clusters
cluster_freq_info = pd.DataFrame(Counter(clustering.labels_), index=[0]).T.reset_index()
# T: Transpose index and columns
# view
cluster_freq_info.shape
Out[42]:
(159, 2)
In [43]:
# Name columns and view
cluster_freq_info.columns = ["cluster_id", "number of samples"]
cluster_freq_info.head()
Out[43]:
cluster_id number of samples
0 58 33
1 38 14
2 11 28
3 62 20
4 69 5
In [44]:
# Select the top clusters for visualization
cluster_freq_info = cluster_freq_info.sort_values("number of samples", ascending=False)
In [45]:
cluster_freq_info
Out[45]:
cluster_id number of samples
50 104 59
13 1 36
0 58 33
48 70 33
17 88 32
... ... ...
7 0 1
27 3 1
81 29 1
83 30 1
158 150 1

159 rows × 2 columns

In [46]:
# We will create clusters to substrate mappings,
# convert strings to vector,
# pass the supervised strings through the count vectorizer

supervised_seqs_array = count_vectorizer.transform(X_supervised["sig_gene_seq"].values)
In [47]:
# supervised_seqs_array
supervised_seqs_array_dense = supervised_seqs_array.todense()
supervised_seqs_array_dense
Out[47]:
matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])
In [48]:
# Use the fit cluster object to predict clusters on the supervised set
supervised_set_clusters = clustering.predict(supervised_seqs_array_dense)
In [49]:
# Start to create the mapping
mapping_df = pd.DataFrame(X_supervised["low_level_substr"].values)
In [50]:
# Put the predicted clusters
mapping_df["cluster_id"] = supervised_set_clusters
In [51]:
# Sort and name the columns
mapping_df = mapping_df.sort_values("cluster_id")
mapping_df.columns = ["low_level_substr", "cluster_id"]
mapping_df
Out[51]:
low_level_substr cluster_id
162 xanthan 1
532 alpha-mannan 1
295 carrageenan 1
508 pectin 1
230 host glycan 1
... ... ...
25 rhamnogalacturonan 154
249 rhamnogalacturonan 154
487 xylan 155
175 sialoglycoconjugate 157
168 N-glycan 157

575 rows × 2 columns

In [52]:
# Aggregate substrate by using the cluster id
mapping_df = pd.DataFrame(
    mapping_df.groupby("cluster_id")["low_level_substr"].aggregate(pd.Series.mode)
).reset_index()
In [53]:
# Remove [] from low_level_substr column
catch = []
for seq in mapping_df["low_level_substr"].values:
    seq = str(seq).replace("]", "").replace("[", "")
    catch.append(seq)
In [54]:
mapping_df["low_level_substr"] = catch
mapping_df
Out[54]:
cluster_id low_level_substr
0 1 host glycan
1 2 mucin
2 5 capsule polysaccharide
3 7 xylan
4 8 rhamnogalacturonan
... ... ...
110 152 beta-glucan
111 153 mucin
112 154 rhamnogalacturonan
113 155 xylan
114 157 'N-glycan' 'sialoglycoconjugate'

115 rows × 2 columns

5. Visualization¶

In [55]:
# TSNE for visualization
# T-distributed Stochastic Neighbor Embedding - visualize high-dimensional data.
from sklearn.manifold import TSNE
In [56]:
# Covert sparse matrix
count_vectorized_array
Out[56]:
<1340x415 sparse matrix of type '<class 'numpy.int64'>'
	with 7112 stored elements in Compressed Sparse Row format>
In [57]:
# 3D plot
X_embedded = TSNE(
    n_components=3, init="pca"  # learning_rate='auto',init='random'
).fit_transform(count_vectorized_array.toarray())

# n_components: dimension of the embedded space.
# init: initialization of embedding.
# fit_transform: fit X into an embedded space and return that transformed output.
In [58]:
# View dataframe
X_embedded_df = pd.DataFrame(X_embedded)
X_embedded_df
Out[58]:
0 1 2
0 27.550228 1.689406 1.121049
1 29.111511 -17.498882 27.308899
2 12.939676 -5.533157 -23.785519
3 -83.159393 19.958351 5.421502
4 52.284691 -21.424486 -20.653650
... ... ... ...
1335 64.076935 3.201710 -38.197628
1336 -14.928308 25.385132 27.029896
1337 -67.314041 6.404743 18.647472
1338 -20.898037 -34.193462 -14.455879
1339 57.091476 -29.505133 21.557560

1340 rows × 3 columns

In [59]:
# Add cluster_id column to dataframe
X_embedded_df["cluster_id"] = clustering.labels_
X_embedded_df
Out[59]:
0 1 2 cluster_id
0 27.550228 1.689406 1.121049 58
1 29.111511 -17.498882 27.308899 38
2 12.939676 -5.533157 -23.785519 11
3 -83.159393 19.958351 5.421502 62
4 52.284691 -21.424486 -20.653650 69
... ... ... ... ...
1335 64.076935 3.201710 -38.197628 98
1336 -14.928308 25.385132 27.029896 88
1337 -67.314041 6.404743 18.647472 143
1338 -20.898037 -34.193462 -14.455879 158
1339 57.091476 -29.505133 21.557560 40

1340 rows × 4 columns

In [60]:
# Merge mapping_df with X_embedded_df by cluster_id
X_embedded_df = X_embedded_df.merge(mapping_df, how="left", on="cluster_id")
In [61]:
X_embedded_df
Out[61]:
0 1 2 cluster_id low_level_substr
0 27.550228 1.689406 1.121049 58 alpha-mannan
1 29.111511 -17.498882 27.308899 38 'glucose' 'homogalacturonan' 'host glycan' 'mu...
2 12.939676 -5.533157 -23.785519 11 host glycan
3 -83.159393 19.958351 5.421502 62 capsule polysaccharide
4 52.284691 -21.424486 -20.653650 69 NaN
... ... ... ... ... ...
1335 64.076935 3.201710 -38.197628 98 'N-glycan' 'host glycan'
1336 -14.928308 25.385132 27.029896 88 glycosaminoglycan
1337 -67.314041 6.404743 18.647472 143 capsule polysaccharide
1338 -20.898037 -34.193462 -14.455879 158 NaN
1339 57.091476 -29.505133 21.557560 40 'exopolysaccharide' 'porphyran'

1340 rows × 5 columns

In [62]:
# View top 10 substrates
X_embedded_df["low_level_substr"].value_counts().head(n=10)
Out[62]:
host glycan                  294
capsule polysaccharide       137
glycosaminoglycan             94
pectin                        71
alpha-mannan                  54
acetylated glucuronoxylan     48
homogalacturonan              46
xylan                         38
N-glycan                      34
beta-glucan                   31
Name: low_level_substr, dtype: int64
In [63]:
# Select top 10 substrates
top_10_classes = X_embedded_df["low_level_substr"].value_counts()[:10]
top_10_classes
Out[63]:
host glycan                  294
capsule polysaccharide       137
glycosaminoglycan             94
pectin                        71
alpha-mannan                  54
acetylated glucuronoxylan     48
homogalacturonan              46
xylan                         38
N-glycan                      34
beta-glucan                   31
Name: low_level_substr, dtype: int64
In [64]:
# Make top 10 substrates to list
top_10_classes_list = list(pd.DataFrame(top_10_classes).index)
top_10_classes_list
Out[64]:
['host glycan',
 'capsule polysaccharide',
 'glycosaminoglycan',
 'pectin',
 'alpha-mannan',
 'acetylated glucuronoxylan',
 'homogalacturonan',
 'xylan',
 'N-glycan',
 'beta-glucan']
In [65]:
# Subset X_embedded_df dataframe by list
X_embedded_df_viz = X_embedded_df[
    X_embedded_df["low_level_substr"].isin(top_10_classes_list)
]
X_embedded_df_viz
Out[65]:
0 1 2 cluster_id low_level_substr
0 27.550228 1.689406 1.121049 58 alpha-mannan
2 12.939676 -5.533157 -23.785519 11 host glycan
3 -83.159393 19.958351 5.421502 62 capsule polysaccharide
5 -40.710545 -0.388352 -24.416637 152 beta-glucan
6 2.397810 -11.162458 31.722109 17 host glycan
... ... ... ... ... ...
1327 -59.101582 -15.625198 -7.301012 103 xylan
1328 65.735535 20.191668 -23.182314 27 host glycan
1331 -12.281227 29.358931 13.767683 88 glycosaminoglycan
1336 -14.928308 25.385132 27.029896 88 glycosaminoglycan
1337 -67.314041 6.404743 18.647472 143 capsule polysaccharide

847 rows × 5 columns

In [66]:
# astype:change a pandas object to a specified dtype
X_embedded_df_viz["low_level_substr"] = X_embedded_df_viz["low_level_substr"].astype(str)
/tmp/ipykernel_3088779/3803832902.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_embedded_df_viz["low_level_substr"] = X_embedded_df_viz["low_level_substr"].astype(str)
In [67]:
# plotly - an interactive, open-source, and browser-based graphing library
import plotly.express as px
In [68]:
fig = px.scatter_3d(X_embedded_df_viz, x=0, y=1, z=2, color="low_level_substr")
fig.write_html("cgc_clusters_3D_top10.html")

You can not see the plot in ternimal, it is OK. You can download the file to your own computer and open it in your browser.

In [69]:
# view plot
fig.show()
In [70]:
# view X_embedded_df dataframe
X_embedded_df.shape
Out[70]:
(1340, 5)
In [71]:
# Creat a new dataframe - summarize cgc sequences, cluster ids and substrates
final_df_cgc_cluster_substrate = X_embedded_df[["low_level_substr", "cluster_id"]]
In [72]:
final_df_cgc_cluster_substrate["sig_gene_seq"] = X_train.values
final_df_cgc_cluster_substrate["sig_gene_seq"]
/tmp/ipykernel_3088779/3070682039.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[72]:
0                                 GH29,8.A.46,1.B.14,GH97
1       2.A.21,GerE,FecR,1.B.14,TPR_1,PL35,GH88,GerE,F...
2                                 1.B.14,CBM32,GH2,2.A.88
3                   2.A.66,GT94,GT0,GT4,GT4,GT4,GT2,GH115
4       3.A.11,PL11_1,GH105,GH28,GH154,GH88,GH2,1.B.14...
                              ...                        
1335    PL8_3,PL38,GH88,1.B.14,GH29,GH31,GH2,PL38,GH10...
1336                                       2.A.1,GH2,GH35
1337                                   GT2,2.A.66,GT4,GT2
1338    1.B.14,Sigma70_r2|Sigma70_r4_2,GH28,HTH_AraC,1...
1339           GH2,GH53,8.A.46,1.B.14,GH147,HTH_AraC,PL13
Name: sig_gene_seq, Length: 1340, dtype: object
In [73]:
# Name the columns
final_df_cgc_cluster_substrate[["sig_gene_seq", "cluster_id", "low_level_substr"]]
Out[73]:
sig_gene_seq cluster_id low_level_substr
0 GH29,8.A.46,1.B.14,GH97 58 alpha-mannan
1 2.A.21,GerE,FecR,1.B.14,TPR_1,PL35,GH88,GerE,F... 38 'glucose' 'homogalacturonan' 'host glycan' 'mu...
2 1.B.14,CBM32,GH2,2.A.88 11 host glycan
3 2.A.66,GT94,GT0,GT4,GT4,GT4,GT2,GH115 62 capsule polysaccharide
4 3.A.11,PL11_1,GH105,GH28,GH154,GH88,GH2,1.B.14... 69 NaN
... ... ... ...
1335 PL8_3,PL38,GH88,1.B.14,GH29,GH31,GH2,PL38,GH10... 98 'N-glycan' 'host glycan'
1336 2.A.1,GH2,GH35 88 glycosaminoglycan
1337 GT2,2.A.66,GT4,GT2 143 capsule polysaccharide
1338 1.B.14,Sigma70_r2|Sigma70_r4_2,GH28,HTH_AraC,1... 158 NaN
1339 GH2,GH53,8.A.46,1.B.14,GH147,HTH_AraC,PL13 40 'exopolysaccharide' 'porphyran'

1340 rows × 3 columns

In [74]:
# View substrate by using cluster id
mapping_df[mapping_df["cluster_id"] == 0]
Out[74]:
cluster_id low_level_substr
In [75]:
# View number of sequences by using cluster id
cluster_freq_info[cluster_freq_info["cluster_id"] == 68]
Out[75]:
cluster_id number of samples
137 68 1
In [76]:
final_df_cgc_cluster_substrate["cluster_id"].value_counts()[:10]
Out[76]:
104    59
1      36
58     33
70     33
88     32
11     28
22     28
14     27
144    25
124    24
Name: cluster_id, dtype: int64
In [77]:
# Write out a csv file
final_df_cgc_cluster_substrate.to_csv("cgc_cluster_substrate.tsv", index=False, sep="\t")
In [78]:
# 2D plot
X_embedded_2D = TSNE(n_components=2, init="pca").fit_transform(
    count_vectorized_array.toarray()
)

X_embedded_2D_df = pd.DataFrame(X_embedded_2D)
X_embedded_2D_df["cluster_id"] = clustering.labels_
X_embedded_2D_df = X_embedded_2D_df.merge(mapping_df, how="left", on="cluster_id")
X_embedded_2D_df
Out[78]:
0 1 cluster_id low_level_substr
0 15.473319 7.805315 58 alpha-mannan
1 9.921034 -25.687372 38 'glucose' 'homogalacturonan' 'host glycan' 'mu...
2 15.637207 -15.620399 11 host glycan
3 -65.089294 4.293285 62 capsule polysaccharide
4 38.521568 -17.407051 69 NaN
... ... ... ... ...
1335 33.397781 -14.693295 98 'N-glycan' 'host glycan'
1336 -11.899568 15.598096 88 glycosaminoglycan
1337 -57.790684 13.014769 143 capsule polysaccharide
1338 -7.240588 -38.635487 158 NaN
1339 34.799389 4.621681 40 'exopolysaccharide' 'porphyran'

1340 rows × 4 columns

In [79]:
top_10_classes = X_embedded_2D_df["low_level_substr"].value_counts()[:10]
top_10_classes_list = list(pd.DataFrame(top_10_classes).index)

X_embedded_2D_df_viz = X_embedded_df[
    X_embedded_2D_df["low_level_substr"].isin(top_10_classes_list)
]
X_embedded_2D_df_viz["low_level_substr"] = X_embedded_2D_df_viz[
    "low_level_substr"
].astype(str)

fig = px.scatter(X_embedded_2D_df_viz, x=0, y=1, color="low_level_substr")
fig.write_html("cgc_clusters_2D_top10.html")
fig.show()
/tmp/ipykernel_3088779/1996784998.py:7: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy